# Dataset: Inside Airbnb open data, available at http://insideairbnb.com/get-the-data.html
#############################################
# Library Requirement #
#############################################
library(tidytext) # Package tidytext for conversion of text to and from tidy formats
library(dplyr) # Package dplyr is for data manipulation
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyverse) # Collection of R packages designed for data works harmoniously with other packages
## ── Attaching packages ──────────────────────────────────────────────────────────────── tidyverse 1.3.0 ──
## ✓ ggplot2 3.3.3 ✓ purrr 0.3.3
## ✓ tibble 3.0.4 ✓ stringr 1.4.0
## ✓ tidyr 1.1.0 ✓ forcats 0.5.1
## ✓ readr 1.4.0
## ── Conflicts ─────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(readr) # Package readr is to provide a fast and friendly way to read rectangular data (like csv, tsv, and fwf).
# install.packages("visdat")
library(visdat) # Package for visulizing plot of the missing data
library(ggplot2) # Package for multiple ploting
library(DT) # Package for HTML display of data
library(corrplot) # Package for correlation analysis, confidence interval
## corrplot 0.84 loaded
# install.packages("hrbrthemes")
library(hrbrthemes)# A compilation of extra 'ggplot2' themes, scales and utilities, including a spell check function for plot label fields and an overall emphasis on typography
## NOTE: Either Arial Narrow or Roboto Condensed fonts are required to use these themes.
## Please use hrbrthemes::import_roboto_condensed() to install Roboto Condensed and
## if Arial Narrow is not on your system, please see https://bit.ly/arialnarrow
library(cowplot) # he "cowplot" package is a simple add-on to ggplot. It provides various features that help with creating publication-quality figures, such as a set of themes
# install.packages("webmap")
library(ggmap) # Package ggmap is a collection of functions to visualize spatial data and models on top of static maps from various online sources (e.g Google Maps)
## Google's Terms of Service: https://cloud.google.com/maps-platform/terms/.
## Please cite ggmap if you use it! See citation("ggmap") for details.
##
## Attaching package: 'ggmap'
## The following object is masked from 'package:cowplot':
##
## theme_nothing
#############################################
# Data Preparation #
#############################################
# Read the Shanghai listings export into a tibble.
# The path below is specific to the author's machine; point it at your own
# copy of listings.csv before running.
listings_csv_path <- "/Users/wqr/Desktop/MSDS\ 597\ Final\ Project/Shanghai\ Dataset/listings.csv"
Airbnb_Shanghai_2021 <- read_csv(file = listings_csv_path)
##
## ── Column specification ─────────────────────────────────────────────────────────────────────────────────
## cols(
## id = col_double(),
## name = col_character(),
## host_id = col_double(),
## host_name = col_character(),
## neighbourhood_group = col_logical(),
## neighbourhood = col_character(),
## latitude = col_double(),
## longitude = col_double(),
## room_type = col_character(),
## price = col_double(),
## minimum_nights = col_double(),
## number_of_reviews = col_double(),
## last_review = col_date(format = ""),
## reviews_per_month = col_double(),
## calculated_host_listings_count = col_double(),
## availability_365 = col_double()
## )
# Preview the first 100 listings to check column types and values.
Airbnb_Shanghai_2021 %>% head(n = 100)
## # A tibble: 100 x 16
## id name host_id host_name neighbourhood_g… neighbourhood latitude
## <dbl> <chr> <dbl> <chr> <lgl> <chr> <dbl>
## 1 24963 Hear… 98203 Jia NA 徐汇区 / Xuhui … 31.2
## 2 24991 Fren… 98203 Jia NA 徐汇区 / Xuhui … 31.2
## 3 139828 【sid… 681552 Leon NA 普陀区 / Putuo … 31.2
## 4 161932 Subl… 774393 Michael NA 静安区 / Jing'a… 31.2
## 5 185736 Apt … 891951 Maggie NA 徐汇区 / Xuhui … 31.2
## 6 350728 'Lao… 1777552 Nitin NA 长宁区 / Changn… 31.2
## 7 427038 In t… 2122588 Mia NA 黄浦区 / Huangp… 31.2
## 8 479517 有简约 … 681552 Leon NA 静安区 / Jing'a… 31.2
## 9 479530 【sid… 681552 Leon NA 静安区 / Jing'a… 31.2
## 10 496972 Free… 2454164 Alvin NA 杨浦区 / Yangpu… 31.3
## # … with 90 more rows, and 9 more variables: longitude <dbl>, room_type <chr>,
## # price <dbl>, minimum_nights <dbl>, number_of_reviews <dbl>,
## # last_review <date>, reviews_per_month <dbl>,
## # calculated_host_listings_count <dbl>, availability_365 <dbl>
# Number of rows and columns in the dataset.
Airbnb_Shanghai_2021 %>% dim()
## [1] 36294 16
# Visualise which cells are missing, column by column.
vis_miss(x = Airbnb_Shanghai_2021)
# Interactive HTML table with a button for toggling column visibility.
datatable(
  Airbnb_Shanghai_2021,
  extensions = "Buttons",
  options = list(dom = "Bfrtip", buttons = I("colvis"))
)
## Warning in instance$preRenderHook(instance): It seems your data is too big
## for client-side DataTables. You may consider server-side processing: https://
## rstudio.github.io/DT/server.html
#############################################
# Correlation Metrics Analysis #
#############################################
# Spearman rank correlation between the numeric columns.
# Only rows with no missing value in any numeric column are used.
is_numeric_col <- vapply(Airbnb_Shanghai_2021, is.numeric, logical(1))
numeric_listings <- Airbnb_Shanghai_2021[, is_numeric_col]
complete_rows <- complete.cases(numeric_listings)
Airbnb_Shanghai_2021_cor <- numeric_listings[complete_rows, ]
correlation_matrix <- cor(Airbnb_Shanghai_2021_cor, method = "spearman")
# Draw the matrix as a grid of coloured cells.
corrplot(correlation_matrix, method = "color")
#############################################
# Exploratory Data Analysis #
#############################################
### Distribution of Shanghai Airbnb price
# Shared theme tweaks reused by several plots: no grid lines, no panel
# background, black axis lines, white legend keys and text size 15.
background_canvas <- theme(
  text = element_text(size = 15),
  legend.key = element_rect(fill = "white"),
  axis.line.y = element_line(colour = "black"),
  axis.line.x = element_line(colour = "black"),
  panel.background = element_blank(),
  panel.grid.minor = element_blank(),
  panel.grid.major = element_blank()
)
# Histogram of listing prices in bins of width 15.
# Two no-op pieces were removed without changing the rendered plot:
#   * par(mfrow = c(2, 1)) only configures base graphics, not ggplot2 output;
#   * background_canvas was added before theme_minimal(), a complete theme
#     that replaces every theme setting made earlier in the chain.
ggplot(Airbnb_Shanghai_2021) +
  geom_histogram(aes(price), fill = "orange", alpha = 0.85, binwidth = 15) +
  theme_minimal(base_size = 13) +
  xlab("Price") +
  ylab("Frequency") +
  ggtitle("The Distribution of Price in Shanghai 2021")
### Distribution of Shanghai Airbnb price with a log10-transformed x-axis
# Listings priced at 0 cannot be drawn on a log axis (log10(0) is -Inf), so
# they are excluded up front rather than being dropped with warnings by the
# histogram and density stats. The mean shown is computed on the same rows.
priced_listings <- Airbnb_Shanghai_2021 %>% filter(price > 0)
mean_price <- round(mean(priced_listings$price), 2)
# scale_x_log10() is added once; a second copy only replaced the first and
# triggered a "Scale for 'x' is already present" message.
ggplot(priced_listings, aes(price)) +
  background_canvas +
  geom_histogram(bins = 30, aes(y = after_stat(density)), color = "black", fill = "orange") +
  geom_density(alpha = 0.2, color = "red") +
  ggtitle("Transformed distribution of price (Display in RMB, 1$≈6.5RMB )",
          subtitle = expression("With" ~'log'[10] ~ "transformation of x-axis")) +
  scale_x_log10() +
  # Dotted vertical line at the mean price.
  geom_vline(xintercept = mean_price, size = 1, linetype = 3) +
  annotate("text", x = 1800, y = 0.75, label = paste("Mean price = ", paste0(mean_price, "RMB")),
           color = "#32CD32", size = 6)
#############################################
# neighborhood mean price #
#############################################
# Mean price per neighbourhood, followed by one log-scaled price histogram per
# neighbourhood with that mean marked.
# Zero-priced listings are excluded first: log10(0) is -Inf, which made the
# histogram and density stats drop those rows with warnings. The per-
# neighbourhood means are computed on the same filtered rows.
neighbourhood_priced <- Airbnb_Shanghai_2021 %>% filter(price > 0)
airbnb_neighbourhood <- neighbourhood_priced %>%
  group_by(neighbourhood) %>%
  summarise(price = round(mean(price), 2))
ggplot(neighbourhood_priced, aes(price)) +
  geom_histogram(bins = 30, aes(y = after_stat(density)), fill = "orange") +
  geom_density(alpha = 0.2, fill = "red") +
  background_canvas +
  ggtitle("Transformed distribution of price\n by neighbourhood groups",
          subtitle = expression("With" ~'log'[10] ~ "transformation of x-axis")) +
  # Dotted line at each neighbourhood's mean, with a text label offset to its right.
  geom_vline(data = airbnb_neighbourhood, aes(xintercept = price), size = 1, linetype = 3) +
  geom_text(data = airbnb_neighbourhood, y = 1.5, aes(x = price + 1400, label = paste("Mean = ", price)), color = "#32CD32", size = 3) +
  facet_wrap(~neighbourhood) +
  scale_x_log10()
#############################################
# Average Price by Different/Room Type #
#############################################
# Mean price for each room type, as a two-column data frame
# (room_type, average_price).
mean_room_type <- with(
  Airbnb_Shanghai_2021,
  aggregate(
    list(average_price = price),
    by = list(room_type = room_type),
    FUN = mean
  )
)
mean_room_type
## room_type average_price
## 1 Entire home/apt 923.8280
## 2 Hotel room 0.0000
## 3 Private room 523.0714
## 4 Shared room 399.3215
### Average price by room type (horizontal lollipop chart)
# Labels are rounded to two decimals, matching the other price charts in this
# script; the unrounded means printed with long decimal tails.
ggplot(data = mean_room_type, aes(x = room_type, y = average_price)) +
  coord_flip() +
  # Stem from zero to the mean, then a dot at the mean.
  geom_segment(aes(xend = room_type, yend = 0, color = room_type), size = 2) +
  geom_point(size = 6, mapping = aes(color = room_type)) +
  geom_text(aes(label = round(average_price, 2)), vjust = -1.5) +
  background_canvas +
  labs(title = "Average price by Room type in Shanghai 2021",
       x = "Room Type", y = "Average Price")
#############################################
# Neighborhood Analysis #
#############################################
# Count the listings in each neighbourhood and express each count as a
# percentage of all listings; rows are sorted by ascending count.
neighbourhood_counts <- table(Airbnb_Shanghai_2021$neighbourhood)
freq_area <- data.frame(cbind(
  Frequency = neighbourhood_counts,
  Percent = prop.table(neighbourhood_counts) * 100
))
ascending_by_count <- order(freq_area$Frequency)
freq_area <- freq_area[ascending_by_count, ]
freq_area
## Frequency Percent
## 金山区 / Jinshan District 179 0.4931945
## 奉贤区 / Fengxian District 277 0.7632116
## 宝山区 / Baoshan District 641 1.7661321
## 嘉定区 / Jiading District 808 2.2262633
## 普陀区 / Putuo District 828 2.2813688
## 杨浦区 / Yangpu District 888 2.4466854
## 虹口区 / Hongkou District 946 2.6064914
## 青浦区 / Qingpu District 1143 3.1492809
## 松江区 / Songjiang District 1232 3.3945005
## 崇明区 / Chongming District 1308 3.6039015
## 长宁区 / Changning District 1358 3.7416653
## 闵行区 / Minhang District 2317 6.3839753
## 静安区 / Jing'an District 2321 6.3949964
## 徐汇区 / Xuhui District 3671 10.1146195
## 黄浦区 / Huangpu District 4500 12.3987436
## 浦东新区 / Pudong 13877 38.2349700
# Theme for the frequency chart: centred title, small bold rotated axis text,
# bold legend text.
tema <- theme(
  legend.text = element_text(size = 14, face = "bold"),
  axis.title.y = element_text(size = 7),
  axis.title.x = element_text(size = 7),
  axis.text.y = element_text(size = 8, angle = 10, face = "bold"),
  axis.text.x = element_text(size = 8, angle = 45, face = "bold"),
  plot.title = element_text(size = 18, hjust = .5)
)
# The last 10 rows of the ascending table are the 10 neighbourhoods with the
# most listings; move the row names into a regular column for plotting.
busiest_rows <- tail(freq_area, 10)
freq_area_df <- data.frame(
  neighbourhood = rownames(busiest_rows),
  Frequency = busiest_rows$Frequency
)
freq_area_df
## neighbourhood Frequency
## 1 虹口区 / Hongkou District 946
## 2 青浦区 / Qingpu District 1143
## 3 松江区 / Songjiang District 1232
## 4 崇明区 / Chongming District 1308
## 5 长宁区 / Changning District 1358
## 6 闵行区 / Minhang District 2317
## 7 静安区 / Jing'an District 2321
## 8 徐汇区 / Xuhui District 3671
## 9 黄浦区 / Huangpu District 4500
## 10 浦东新区 / Pudong 13877
# Plot the listing count of the 10 busiest neighbourhoods.
# Layers are drawn in the order listed: points, then the line, then the bars.
options(repr.plot.height = 10, repr.plot.width = 20)
ggplot(freq_area_df, aes(x = neighbourhood, y = Frequency)) +
  theme_minimal() +
  geom_point(colour = "darkblue", size = 4) +
  labs(title = "TOP 10 most frequent neighbourhood in Shanghai City", x = "") +
  geom_line(group = 2, linetype = 17, size = 1, alpha = .5, color = "black") +
  geom_bar(
    aes(fill = neighbourhood, color = neighbourhood),
    stat = "identity",
    alpha = .8,
    size = .7
  ) +
  tema
### Take a look at the top 10 neighbourhoods
#### Top 10 highest prices
# Mean price per neighbourhood, sorted ascending so the most expensive
# neighbourhoods end up in the last rows.
top_10_neighbourhood <- aggregate(list(Airbnb_Shanghai_2021$price), list(Airbnb_Shanghai_2021$neighbourhood), mean)
colnames(top_10_neighbourhood) <- c("neighbourhood", "Average_price_per_neighborhood")
top_10_neighbourhood <- top_10_neighbourhood[order(top_10_neighbourhood$Average_price_per_neighborhood), ]
# Keep the 10 most expensive rows. Taking tail(12) followed by head(10)
# selected ranks 3-12 instead and left out the two priciest neighbourhoods.
top_10_neighbourhood <- tail(top_10_neighbourhood, 10)
# Row names carry the rank (1 = most expensive); sized from the actual row
# count so the assignment also works with fewer than 10 neighbourhoods.
row.names(top_10_neighbourhood) <- rev(seq_len(nrow(top_10_neighbourhood)))
top_10_neighbourhood
## neighbourhood Average_price_per_neighborhood
## 10 徐汇区 / Xuhui District 648.9120
## 9 长宁区 / Changning District 686.5140
## 8 金山区 / Jinshan District 694.3520
## 7 普陀区 / Putuo District 706.1377
## 6 黄浦区 / Huangpu District 714.2647
## 5 浦东新区 / Pudong 765.8575
## 4 奉贤区 / Fengxian District 822.6498
## 3 松江区 / Songjiang District 1019.5917
## 2 青浦区 / Qingpu District 1159.4917
## 1 崇明区 / Chongming District 1556.2554
# Compact theme for the price bar charts: centred title, small bold axis
# text, small axis titles and no legend.
tema <- theme(
  legend.position = "none",
  axis.title.y = element_text(size = 7),
  axis.title.x = element_text(size = 7),
  axis.text.y = element_text(size = 6, face = "bold"),
  axis.text.x = element_text(size = 6, face = "bold"),
  plot.title = element_text(size = 15, hjust = .5)
)
# The second chart uses exactly the same settings.
tema1 <- tema
# Two views of the same neighbourhood price table: a horizontal bar chart with
# value labels (a) and a plain bar chart that is switched to polar
# coordinates when the grid is assembled (b).
options(repr.plot.height = 11, repr.plot.width = 20)
# Bars and labels shared by both charts.
expensive_bars <- ggplot(
  top_10_neighbourhood,
  aes(x = neighbourhood, y = Average_price_per_neighborhood)
) +
  geom_bar(
    aes(fill = neighbourhood, color = neighbourhood),
    stat = "identity",
    alpha = .8,
    size = .7
  )
expensive_labels <- labs(
  title = "TOP 10 most expensive neighborhoods in Shanghai City",
  x = "",
  y = ""
)
most_expensive_plot_a <- expensive_bars +
  geom_label(
    aes(label = round(Average_price_per_neighborhood, 2)),
    size = 3,
    fill = "#F5FFFA",
    fontface = "bold"
  ) +
  coord_flip() +
  theme_ipsum() +
  expensive_labels +
  tema
most_expensive_plot_b <- expensive_bars +
  theme_ipsum() +
  expensive_labels +
  tema1
# Place the two charts side by side.
plot_grid(
  most_expensive_plot_a,
  most_expensive_plot_b + coord_polar(),
  ncol = 2,
  nrow = 1
)
## Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): font family
## 'Arial Narrow' not found in PostScript font database
## Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): font family
## 'Arial Narrow' not found in PostScript font database
## Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): font family
## 'Arial Narrow' not found in PostScript font database
## Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): font family
## 'Arial Narrow' not found in PostScript font database
## Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): font family
## 'Arial Narrow' not found in PostScript font database
## Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): font family
## 'Arial Narrow' not found in PostScript font database
## Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): font family
## 'Arial Narrow' not found in PostScript font database
## Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): font family
## 'Arial Narrow' not found in PostScript font database
## Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): font family
## 'Arial Narrow' not found in PostScript font database
## Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): font family
## 'Arial Narrow' not found in PostScript font database
## Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): font family
## 'Arial Narrow' not found in PostScript font database
## Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): font family
## 'Arial Narrow' not found in PostScript font database
## Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): font family
## 'Arial Narrow' not found in PostScript font database
## Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): font family
## 'Arial Narrow' not found in PostScript font database
## Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): font family
## 'Arial Narrow' not found in PostScript font database
## Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): font family
## 'Arial Narrow' not found in PostScript font database
## Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): font family
## 'Arial Narrow' not found in PostScript font database
## Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): font family
## 'Arial Narrow' not found in PostScript font database
## Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): font family
## 'Arial Narrow' not found in PostScript font database
## Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): font family
## 'Arial Narrow' not found in PostScript font database
## Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): font family
## 'Arial Narrow' not found in PostScript font database
## Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): font family
## 'Arial Narrow' not found in PostScript font database
## Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): font family
## 'Arial Narrow' not found in PostScript font database
## Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): font family
## 'Arial Narrow' not found in PostScript font database
## Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): font family
## 'Arial Narrow' not found in PostScript font database
## Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): font family
## 'Arial Narrow' not found in PostScript font database
## Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): font family
## 'Arial Narrow' not found in PostScript font database
## Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): font family
## 'Arial Narrow' not found in PostScript font database
## Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): font family
## 'Arial Narrow' not found in PostScript font database
## Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): font family
## 'Arial Narrow' not found in PostScript font database
## Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): font family
## 'Arial Narrow' not found in PostScript font database
## Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): font family
## 'Arial Narrow' not found in PostScript font database
## Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): font family
## 'Arial Narrow' not found in PostScript font database
## Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): font family
## 'Arial Narrow' not found in PostScript font database
## Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): font family
## 'Arial Narrow' not found in PostScript font database
## Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): font family
## 'Arial Narrow' not found in PostScript font database
## Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): font family
## 'Arial Narrow' not found in PostScript font database
## Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): font family
## 'Arial Narrow' not found in PostScript font database
## Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): font family
## 'Arial Narrow' not found in PostScript font database
## Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): font family
## 'Arial Narrow' not found in PostScript font database
## Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): font family
## 'Arial Narrow' not found in PostScript font database
## Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): font family
## 'Arial Narrow' not found in PostScript font database
### Lowest-priced neighbourhoods
# Mean price per neighbourhood, sorted from cheapest to most expensive.
neighbourhood_means <- with(
  Airbnb_Shanghai_2021,
  aggregate(
    list(Average_price_per_neighborhood = price),
    by = list(neighbourhood = neighbourhood),
    FUN = mean
  )
)
cheapest_first <- order(neighbourhood_means$Average_price_per_neighborhood)
top_10_lowest_neighbourhood <- neighbourhood_means[cheapest_first, ]
top_10_lowest_neighbourhood
## neighbourhood Average_price_per_neighborhood
## 7 杨浦区 / Yangpu District 382.0541
## 3 宝山区 / Baoshan District 418.7676
## 10 虹口区 / Hongkou District 485.1142
## 1 嘉定区 / Jiading District 513.5099
## 13 闵行区 / Minhang District 604.6832
## 15 静安区 / Jing'an District 643.1443
## 5 徐汇区 / Xuhui District 648.9120
## 12 长宁区 / Changning District 686.5140
## 11 金山区 / Jinshan District 694.3520
## 6 普陀区 / Putuo District 706.1377
## 16 黄浦区 / Huangpu District 714.2647
## 9 浦东新区 / Pudong 765.8575
## 2 奉贤区 / Fengxian District 822.6498
## 8 松江区 / Songjiang District 1019.5917
## 14 青浦区 / Qingpu District 1159.4917
## 4 崇明区 / Chongming District 1556.2554
# Keep the 10 cheapest neighbourhoods. The table is sorted by ascending mean
# price, so the cheapest rows are at the head; tail() returned the 10 most
# expensive ones instead.
top_10_lowest_neighbourhood <- head(top_10_lowest_neighbourhood, 10)
# Row names carry the rank (1 = cheapest); sized from the actual row count so
# the assignment also works with fewer than 10 neighbourhoods.
row.names(top_10_lowest_neighbourhood) <- seq_len(nrow(top_10_lowest_neighbourhood))
top_10_lowest_neighbourhood
## neighbourhood Average_price_per_neighborhood
## 1 杨浦区 / Yangpu District 382.0541
## 2 宝山区 / Baoshan District 418.7676
## 3 虹口区 / Hongkou District 485.1142
## 4 嘉定区 / Jiading District 513.5099
## 5 闵行区 / Minhang District 604.6832
## 6 静安区 / Jing'an District 643.1443
## 7 徐汇区 / Xuhui District 648.9120
## 8 长宁区 / Changning District 686.5140
## 9 金山区 / Jinshan District 694.3520
## 10 普陀区 / Putuo District 706.1377
# Theme for this chart: centred title, small bold slightly rotated axis text,
# no legend.
tema <- theme(
  legend.position = "none",
  axis.title.y = element_text(size = 7),
  axis.title.x = element_text(size = 7),
  axis.text.y = element_text(size = 6, angle = 10, face = "bold"),
  axis.text.x = element_text(size = 6, angle = 15, face = "bold"),
  plot.title = element_text(size = 15, hjust = .5)
)
options(repr.plot.height = 10, repr.plot.width = 20)
# Bar chart of the selected neighbourhoods with the mean price on each bar.
lowest_neighborhoods_plot_a <- ggplot(
  top_10_lowest_neighbourhood,
  aes(x = neighbourhood, y = Average_price_per_neighborhood)
) +
  geom_bar(
    aes(fill = neighbourhood, color = neighbourhood),
    stat = "identity",
    alpha = .8,
    size = .7
  ) +
  geom_label(
    aes(label = round(Average_price_per_neighborhood, 2)),
    size = 3,
    fill = "#F5FFFA",
    fontface = "bold"
  ) +
  theme_ipsum() +
  labs(title = "TOP 10 cheapest neighborhoods in Shanghai City ", x = "", y = "") +
  tema
lowest_neighborhoods_plot_a
#############################################
# The Relationship between Price and Reviews#
#############################################
# Scatter plot of price against review count; point size also encodes price,
# and the low alpha lets dense regions show through.
ggplot(Airbnb_Shanghai_2021, aes(x = number_of_reviews, y = price)) +
  theme(axis.title = element_text(), axis.title.x = element_text()) +
  geom_point(aes(size = price), colour = "red", alpha = 0.05) +
  background_canvas +
  labs(
    x = "Number of reviews",
    y = "Price",
    title = "Relationship between prices number of reviews",
    subtitle = "The most expensive houses have small number of reviews"
  )
## The most expensive houses have small number of reviews
#############################################
# Map for Airbnb House Distribution #
#############################################
# Build the bounding box for the base map: the extent of the listings'
# coordinates, padded by 5% of the height/width on every side.
lat_range <- range(Airbnb_Shanghai_2021$latitude)
lon_range <- range(Airbnb_Shanghai_2021$longitude)
height <- diff(lat_range)
width <- diff(lon_range)
Canvas_borders <- c(
  bottom = lat_range[1] - 0.05 * height,
  top = lat_range[2] + 0.05 * height,
  left = lon_range[1] - 0.05 * width,
  right = lon_range[2] + 0.05 * width
)
# Available map types: "terrain", "terrain-background", "terrain-labels",
# "terrain-lines", "toner", "toner-2010", "toner-2011", "toner-background",
# "toner-hybrid", "toner-labels", "toner-lines", "toner-lite", "watercolor".
# Download the tiles; the result is a ggmap object.
map <- get_stamenmap(bbox = Canvas_borders, zoom = 10, maptype = "toner-lite")
## Source : http://tile.stamen.com/toner-lite/10/855/416.png
## Source : http://tile.stamen.com/toner-lite/10/856/416.png
## Source : http://tile.stamen.com/toner-lite/10/857/416.png
## Source : http://tile.stamen.com/toner-lite/10/858/416.png
## Source : http://tile.stamen.com/toner-lite/10/859/416.png
## Source : http://tile.stamen.com/toner-lite/10/855/417.png
## Source : http://tile.stamen.com/toner-lite/10/856/417.png
## Source : http://tile.stamen.com/toner-lite/10/857/417.png
## Source : http://tile.stamen.com/toner-lite/10/858/417.png
## Source : http://tile.stamen.com/toner-lite/10/859/417.png
## Source : http://tile.stamen.com/toner-lite/10/855/418.png
## Source : http://tile.stamen.com/toner-lite/10/856/418.png
## Source : http://tile.stamen.com/toner-lite/10/857/418.png
## Source : http://tile.stamen.com/toner-lite/10/858/418.png
## Source : http://tile.stamen.com/toner-lite/10/859/418.png
## Source : http://tile.stamen.com/toner-lite/10/855/419.png
## Source : http://tile.stamen.com/toner-lite/10/856/419.png
## Source : http://tile.stamen.com/toner-lite/10/857/419.png
## Source : http://tile.stamen.com/toner-lite/10/858/419.png
## Source : http://tile.stamen.com/toner-lite/10/859/419.png
## Source : http://tile.stamen.com/toner-lite/10/855/420.png
## Source : http://tile.stamen.com/toner-lite/10/856/420.png
## Source : http://tile.stamen.com/toner-lite/10/857/420.png
## Source : http://tile.stamen.com/toner-lite/10/858/420.png
## Source : http://tile.stamen.com/toner-lite/10/859/420.png
# Overlay every listing on the base map, coloured by log price on a
# red-yellow-green palette.
ggmap(map) +
  geom_point(
    aes(x = longitude, y = latitude, colour = log(price)),
    data = Airbnb_Shanghai_2021
  ) +
  scale_color_distiller(palette = "RdYlGn", direction = 1)
#############################################
# Price Prediction #
#############################################
# Remove the neighbourhood_group column before modelling, then print the
# resulting tibble.
Airbnb_Shanghai_2021 <- Airbnb_Shanghai_2021 %>%
  select(-neighbourhood_group)
Airbnb_Shanghai_2021
## # A tibble: 36,294 x 15
## id name host_id host_name neighbourhood latitude longitude room_type
## <dbl> <chr> <dbl> <chr> <chr> <dbl> <dbl> <chr>
## 1 24963 Hear… 98203 Jia 徐汇区 / Xuhui … 31.2 121. Entire h…
## 2 24991 Fren… 98203 Jia 徐汇区 / Xuhui … 31.2 121. Entire h…
## 3 139828 【sid… 681552 Leon 普陀区 / Putuo … 31.2 121. Entire h…
## 4 161932 Subl… 774393 Michael 静安区 / Jing'a… 31.2 121. Entire h…
## 5 185736 Apt … 891951 Maggie 徐汇区 / Xuhui … 31.2 121. Private …
## 6 350728 'Lao… 1777552 Nitin 长宁区 / Changn… 31.2 121. Private …
## 7 427038 In t… 2122588 Mia 黄浦区 / Huangp… 31.2 121. Private …
## 8 479517 有简约 … 681552 Leon 静安区 / Jing'a… 31.2 121. Entire h…
## 9 479530 【sid… 681552 Leon 静安区 / Jing'a… 31.2 121. Entire h…
## 10 496972 Free… 2454164 Alvin 杨浦区 / Yangpu… 31.3 121. Private …
## # … with 36,284 more rows, and 7 more variables: price <dbl>,
## # minimum_nights <dbl>, number_of_reviews <dbl>, last_review <date>,
## # reviews_per_month <dbl>, calculated_host_listings_count <dbl>,
## # availability_365 <dbl>
# Split the listings into a 70% training sample and the remaining test rows.
# The RNG is seeded so the random split, and every model fitted on it, can be
# reproduced on a later run.
set.seed(597)
# Replace id with the row number so it is a unique key for the anti-join.
Airbnb_Shanghai_2021 <- Airbnb_Shanghai_2021 %>% mutate(id = row_number())
# Zero-priced listings are dropped from both sets because the models use log(price).
airbnb_train <- Airbnb_Shanghai_2021 %>% sample_frac(.7) %>% filter(price > 0)
airbnb_test <- anti_join(Airbnb_Shanghai_2021, airbnb_train, by = 'id') %>% filter(price > 0)
head(airbnb_train)
## # A tibble: 6 x 15
## id name host_id host_name neighbourhood latitude longitude room_type price
## <int> <chr> <dbl> <chr> <chr> <dbl> <dbl> <chr> <dbl>
## 1 33147 (奕可3… 2.83e8 蔡@君 浦东新区 / Pudon… 31.1 122. Entire h… 424
## 2 6263 [可长租… 7.63e7 小麦 浦东新区 / Pudon… 31.1 122. Entire h… 328
## 3 35248 【特惠八… 3.04e8 Mizao Di… 浦东新区 / Pudon… 31.1 122. Entire h… 3826
## 4 27799 Skyv… 3.85e6 Hope 黄浦区 / Huangp… 31.2 121. Private … 316
## 5 25029 【五星推… 1.62e8 Ruiyin 崇明区 / Chongm… 31.7 121. Private … 498
## 6 6663 上海杨浦… 1.04e8 闲主 杨浦区 / Yangpu… 31.3 122. Entire h… 279
## # … with 6 more variables: minimum_nights <dbl>, number_of_reviews <dbl>,
## # last_review <date>, reviews_per_month <dbl>,
## # calculated_host_listings_count <dbl>, availability_365 <dbl>
# Sanity check: the training and test rows together must account for every
# listing with a positive price.
n_priced_listings <- Airbnb_Shanghai_2021 %>%
  filter(price > 0) %>%
  nrow()
nrow(airbnb_train) + nrow(airbnb_test) == n_priced_listings
## [1] TRUE
# Model 1: linear regression of log(price) on all candidate predictors,
# fitted on the training set.
model1 <- lm(
  formula = log(price) ~ minimum_nights + number_of_reviews +
    reviews_per_month + availability_365 + latitude + longitude +
    calculated_host_listings_count + room_type + neighbourhood,
  data = airbnb_train
)
summary(model1)
##
## Call:
## lm(formula = log(price) ~ minimum_nights + number_of_reviews +
## reviews_per_month + availability_365 + latitude + longitude +
## calculated_host_listings_count + room_type + neighbourhood,
## data = airbnb_train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.3906 -0.4454 -0.1145 0.3010 6.3838
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.915e+01 1.757e+01 2.228 0.02587
## minimum_nights -1.533e-04 2.389e-04 -0.642 0.52094
## number_of_reviews 6.889e-05 2.659e-04 0.259 0.79561
## reviews_per_month -6.459e-02 7.428e-03 -8.695 < 2e-16
## availability_365 -1.000e-04 4.637e-05 -2.157 0.03103
## latitude -4.289e-01 1.625e-01 -2.640 0.00831
## longitude -1.638e-01 1.161e-01 -1.411 0.15824
## calculated_host_listings_count -4.034e-04 2.026e-04 -1.991 0.04648
## room_typePrivate room -5.965e-01 1.379e-02 -43.267 < 2e-16
## room_typeShared room -1.346e+00 3.903e-02 -34.485 < 2e-16
## neighbourhood奉贤区 / Fengxian District 1.285e-01 9.920e-02 1.295 0.19540
## neighbourhood宝山区 / Baoshan District 4.853e-02 6.927e-02 0.701 0.48355
## neighbourhood崇明区 / Chongming District 1.464e+00 9.549e-02 15.335 < 2e-16
## neighbourhood徐汇区 / Xuhui District 3.138e-01 5.274e-02 5.950 2.75e-09
## neighbourhood普陀区 / Putuo District 2.670e-01 6.457e-02 4.135 3.57e-05
## neighbourhood杨浦区 / Yangpu District 1.913e-01 6.673e-02 2.866 0.00416
## neighbourhood松江区 / Songjiang District 2.185e-01 6.948e-02 3.144 0.00167
## neighbourhood浦东新区 / Pudong 5.248e-01 5.860e-02 8.955 < 2e-16
## neighbourhood虹口区 / Hongkou District 2.573e-01 6.203e-02 4.147 3.39e-05
## neighbourhood金山区 / Jinshan District 6.232e-01 1.336e-01 4.663 3.14e-06
## neighbourhood长宁区 / Changning District 2.354e-01 5.730e-02 4.109 4.00e-05
## neighbourhood闵行区 / Minhang District 1.233e-01 5.892e-02 2.093 0.03639
## neighbourhood青浦区 / Qingpu District 6.434e-01 6.623e-02 9.715 < 2e-16
## neighbourhood静安区 / Jing'an District 3.073e-01 5.346e-02 5.747 9.28e-09
## neighbourhood黄浦区 / Huangpu District 4.409e-01 5.207e-02 8.467 < 2e-16
##
## (Intercept) *
## minimum_nights
## number_of_reviews
## reviews_per_month ***
## availability_365 *
## latitude **
## longitude
## calculated_host_listings_count *
## room_typePrivate room ***
## room_typeShared room ***
## neighbourhood奉贤区 / Fengxian District
## neighbourhood宝山区 / Baoshan District
## neighbourhood崇明区 / Chongming District ***
## neighbourhood徐汇区 / Xuhui District ***
## neighbourhood普陀区 / Putuo District ***
## neighbourhood杨浦区 / Yangpu District **
## neighbourhood松江区 / Songjiang District **
## neighbourhood浦东新区 / Pudong ***
## neighbourhood虹口区 / Hongkou District ***
## neighbourhood金山区 / Jinshan District ***
## neighbourhood长宁区 / Changning District ***
## neighbourhood闵行区 / Minhang District *
## neighbourhood青浦区 / Qingpu District ***
## neighbourhood静安区 / Jing'an District ***
## neighbourhood黄浦区 / Huangpu District ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.7267 on 13692 degrees of freedom
## (11683 observations deleted due to missingness)
## Multiple R-squared: 0.225, Adjusted R-squared: 0.2237
## F-statistic: 165.7 on 24 and 13692 DF, p-value: < 2.2e-16
# Akaike and Bayesian information criteria for model 1.
stats::AIC(model1)
## [1] 30194.74
stats::BIC(model1)
## [1] 30390.43
# Trim the training set to prices strictly between its 10th and 90th
# percentiles, then drop every row that still contains a missing value.
price_cutoffs <- quantile(airbnb_train$price, probs = c(0.1, 0.9))
airbnb_trained_filtered <- airbnb_train %>%
  filter(price > price_cutoffs[[1]], price < price_cutoffs[[2]]) %>%
  drop_na()
# Model 2: the same regression without minimum_nights and longitude, fitted on
# the price-trimmed training rows.
model2 <- lm(
  formula = log(price) ~ number_of_reviews + reviews_per_month +
    availability_365 + latitude + calculated_host_listings_count +
    room_type + neighbourhood,
  data = airbnb_trained_filtered
)
summary(model2)
##
## Call:
## lm(formula = log(price) ~ number_of_reviews + reviews_per_month +
## availability_365 + latitude + calculated_host_listings_count +
## room_type + neighbourhood, data = airbnb_trained_filtered)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.12889 -0.31352 -0.04667 0.27337 1.45660
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 9.751e-01 2.742e+00 0.356 0.722185
## number_of_reviews 7.536e-04 1.719e-04 4.383 1.18e-05
## reviews_per_month -5.387e-02 4.752e-03 -11.336 < 2e-16
## availability_365 -5.567e-05 3.036e-05 -1.834 0.066726
## latitude 1.552e-01 8.759e-02 1.772 0.076454
## calculated_host_listings_count -2.518e-04 1.277e-04 -1.972 0.048636
## room_typePrivate room -2.573e-01 9.177e-03 -28.032 < 2e-16
## room_typeShared room -4.773e-01 5.315e-02 -8.980 < 2e-16
## neighbourhood奉贤区 / Fengxian District -7.146e-02 6.815e-02 -1.049 0.294354
## neighbourhood宝山区 / Baoshan District -3.859e-02 4.642e-02 -0.831 0.405743
## neighbourhood崇明区 / Chongming District 4.552e-01 4.838e-02 9.409 < 2e-16
## neighbourhood徐汇区 / Xuhui District 2.464e-01 3.510e-02 7.018 2.37e-12
## neighbourhood普陀区 / Putuo District 1.134e-01 4.340e-02 2.614 0.008965
## neighbourhood杨浦区 / Yangpu District 6.057e-02 4.112e-02 1.473 0.140721
## neighbourhood松江区 / Songjiang District 5.653e-02 4.515e-02 1.252 0.210608
## neighbourhood浦东新区 / Pudong 2.199e-01 3.488e-02 6.305 2.98e-10
## neighbourhood虹口区 / Hongkou District 1.080e-01 3.887e-02 2.779 0.005465
## neighbourhood金山区 / Jinshan District 5.822e-01 8.096e-02 7.191 6.82e-13
## neighbourhood长宁区 / Changning District 1.408e-01 3.827e-02 3.680 0.000234
## neighbourhood闵行区 / Minhang District 1.272e-01 3.985e-02 3.191 0.001424
## neighbourhood青浦区 / Qingpu District 3.806e-01 4.168e-02 9.132 < 2e-16
## neighbourhood静安区 / Jing'an District 1.639e-01 3.499e-02 4.684 2.85e-06
## neighbourhood黄浦区 / Huangpu District 2.829e-01 3.365e-02 8.407 < 2e-16
##
## (Intercept)
## number_of_reviews ***
## reviews_per_month ***
## availability_365 .
## latitude .
## calculated_host_listings_count *
## room_typePrivate room ***
## room_typeShared room ***
## neighbourhood奉贤区 / Fengxian District
## neighbourhood宝山区 / Baoshan District
## neighbourhood崇明区 / Chongming District ***
## neighbourhood徐汇区 / Xuhui District ***
## neighbourhood普陀区 / Putuo District **
## neighbourhood杨浦区 / Yangpu District
## neighbourhood松江区 / Songjiang District
## neighbourhood浦东新区 / Pudong ***
## neighbourhood虹口区 / Hongkou District **
## neighbourhood金山区 / Jinshan District ***
## neighbourhood长宁区 / Changning District ***
## neighbourhood闵行区 / Minhang District **
## neighbourhood青浦区 / Qingpu District ***
## neighbourhood静安区 / Jing'an District ***
## neighbourhood黄浦区 / Huangpu District ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.4318 on 11305 degrees of freedom
## Multiple R-squared: 0.1204, Adjusted R-squared: 0.1187
## F-statistic: 70.33 on 22 and 11305 DF, p-value: < 2.2e-16
# Akaike and Bayesian information criteria for model 2.
stats::AIC(model2)
## [1] 13146.25
stats::BIC(model2)
## [1] 13322.29
# [1] https://rstudio.github.io/DT/ (DT package documentation)
# [2] https://cran.r-project.org/web/packages/hrbrthemes/hrbrthemes.pdf (hrbrthemes package reference manual)
# Note: the echo = FALSE parameter was added to the code chunk to prevent printing of the R code that generated the plot.